In [ ]:
import plotly.tools as tls
import pandas as pd
from sqlalchemy import create_engine # database connection
import datetime as dt
from IPython.display import display

import plotly.plotly as py # interactive graphing
import plotly.graph_objs as go
from plotly.graph_objs import Bar, Scatter, Marker, Layout, Histogram, Box
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
%matplotlib inline

In [ ]:
import psycopg2 as pg
#load python script that batch loads pandas df to sql
import cStringIO

In [ ]:
#py.sign_in('bottydim', 'o1kuyms9zv') 
init_notebook_mode()

In [ ]:
# Configuration: input CSV names and target database file.
data_dir = './data/'
evt_name = 'Featurespace_events_output.csv'
auth_name = 'Featurespace_auths_output.csv'
# db_name = 'c1_agg.db'  # superseded immediately below; kept for reference
db_name = 'ccfd.db'

In [ ]:
# chunk-by-chunk append to sql
df_evnt = pd.read_csv(data_dir+evt_name)

In [ ]:
df_evnt

In [ ]:
df_evnt.isnull().sum()

In [ ]:
# NOTE(review): `df` is not defined at this point on a fresh kernel -- it leaks
# from the chunked-load loop further down the notebook. This cell only works
# out of order; confirm intent before a Restart-&-Run-All.
df_evnt = df
df_evnt.columns

In [ ]:
len(df_evnt.columns)

In [ ]:
df_evnt['AUTHZN_MSG_TYPE_CD'].unique()

In [ ]:
df.columns

In [ ]:
len(df_evnt.columns)

In [ ]:
# The original line was garbled/unfinished and raised a SyntaxError:
#   df.loc[:,'Dahead().apply(lambda x: getTime(x),1)
# Commented out; see the chunk-load loops below for the working getTime() usage.

In [ ]:
!wc -l < {data_dir+evt_name} # Number of lines in dataset

In [ ]:
!wc -l < {data_dir+auth_name}

107,565,191


In [ ]:
display(pd.read_csv(data_dir+auth_name, nrows=2).head())

In [ ]:
# Local SQLite engine (fallback to the Postgres engine created below).
# NOTE(review): `convert_unicode` is a legacy SQLAlchemy flag (removed in
# modern releases) and the text_factory override forces bytestrings so rows
# with mixed encodings do not abort inserts -- confirm against the installed
# SQLAlchemy version.
disk_engine = create_engine('sqlite:///'+data_dir+db_name,convert_unicode=True)
disk_engine.raw_connection().connection.text_factory = str
# Initializes database with filename in current directory

In [ ]:
# engine = create_engine(
#     "postgresql+pg8000://botty:botty@localhost/ccfd",
#     isolation_level="AUTOCOMMIT"
# )

# Primary engine: local Postgres. AUTOCOMMIT keeps the bulk-load loops simple
# (each COPY/commit stands alone); pool_size raised for repeated connections.
engine = create_engine(
    "postgresql://script@localhost:5432/ccfd",
    isolation_level="AUTOCOMMIT",
    pool_size=20
)
# Re-point the name used by all later queries at the Postgres engine.
disk_engine = engine

In [ ]:
table = 'event'

In [ ]:
create_qry = ' DROP TABLE IF EXISTS {table}; \
CREATE TABLE {table} \
( \
  index bigint, \
  acct_id text, \
  "AUTHZN_RQST_PROC_TM" timestamp without time zone, \
  "AUTHZN_APPRL_CD" text, \
  "AUTHZN_AMT" double precision, \
  "MRCH_NM" text, \
  "MRCH_CITY_NM" text, \
  "MRCH_PSTL_CD" double precision, \
  "MRCH_CNTRY_CD" text, \
  "MRCH_ID" text, \
  "TRMNL_ID" double precision, \
  "MRCH_CATG_CD" text, \
  "POS_ENTRY_MTHD_CD" double precision, \
  "POS_COND_CD" double precision, \
  "TRMNL_CLASFN_CD" double precision, \
  "TRMNL_CAPBLT_CD" double precision, \
  "TRMNL_PIN_CAPBLT_CD" double precision, \
  "TSYS_DCLN_REAS_CD" double precision, \
  "MRCH_TMP_PRTPN_IND" text, \
  "AUTHZN_MSG_TYPE_MODR_CD" text, \
  "AUTHZN_ACCT_STAT_CD" text, \
  "AUTHZN_MSG_TYPE_CD" bigint, \
  "AUTHZN_RQST_TYPE_CD" bigint, \
  "AUTHZN_RESPNS_CD" bigint, \
  "ACCT_STAT_REAS_NUM" bigint, \
  "RQST_CARD_SEQ_NUM" text, \
  "PIN_OFST_IND" bigint, \
  "PIN_VLDTN_IND" text, \
  "CARD_VFCN_REJ_CD" double precision, \
  "CARD_VFCN_RESPNS_CD" text, \
  "CARD_VFCN2_RESPNS_CD" text, \
  "CAVV_CD" double precision, \
  "ECMRC_SCURT_CD" text, \
  "ACQR_BIN_NUM" text, \
  "ACQR_CRCY_CD" double precision, \
  "CRCY_CNVRSN_RT" bigint, \
  "AUTHZN_APPRD_AMT" double precision, \
  "PRIOR_MONEY_AVL_AMT" double precision, \
  "PRIOR_CASH_AVL_AMT" double precision, \
  "ACCT_CL_AMT" double precision, \
  "ACCT_CURR_BAL" double precision, \
  "PREV_ADR_CHNG_DT" timestamp without time zone, \
  "PREV_PMT_DT" timestamp without time zone, \
  "PREV_PMT_AMT" double precision, \
  "PREV_CARD_RQST_DT" timestamp without time zone, \
  "FRD_IND" text, \
  "FRD_IND_SWT_DT" timestamp without time zone \
);'.format(table=table)

In [ ]:
# address = 'postgresql://<username>:<pswd>@<host>:<port>/<database>'
address = 'postgresql://script@localhost:5432/ccfd'
engine = create_engine(address)
connection = engine.raw_connection()
cursor = connection.cursor()

In [ ]:
# Create one single-column index per column of the target table, skipping
# the 'index' column itself.
# NOTE(review): `col_names` is not defined anywhere in this notebook view --
# presumably the column list of `table`; confirm before running.
for c,name in enumerate(col_names):
    if name =='index':
        continue
    cursor.execute('''CREATE INDEX id_auth_{col} 
                ON {table} ({col})'''.format(table=table,col=name))
connection.commit()
print 'idxs created!'

In [ ]:
table = 'auth'
# Composite indexes supporting per-account time-ordered scans and
# time/fraud-date range queries.
# NOTE(review): the table DDL above double-quotes its mixed-case column names
# (e.g. "AUTHZN_RQST_PROC_TM"), but these statements reference them unquoted;
# Postgres folds unquoted identifiers to lowercase, so these may fail with
# 'column does not exist' -- confirm against the live schema.
cursor.execute('''CREATE INDEX id_{table}_acct_id_tm
                ON {table} (acct_id,AUTHZN_RQST_PROC_TM)'''.format(table=table))
cursor.execute('''CREATE INDEX id_{table}_tm_frd 
                ON {table} (AUTHZN_RQST_PROC_TM,FRD_IND_SWT_DT)'''.format(table=table))
connection.commit()

In [ ]:
cursor.__dict__

In [ ]:
cursor.execute(create_qry)
connection.commit()

In [ ]:
with engine.connect() as conn:
    # Destructive cleanup kept disabled; uncomment deliberately.
    # conn.execute('drop TABLE event;')
    pass  # fix: `with` requires a body -- the original cell held only a comment (SyntaxError)

In [ ]:
# Template of a CREATE TABLE statement -- `table_name`, `col_type`,
# `field_length`, and `column_constraints` are placeholders, so executing
# this cell as-is will fail; it is kept as a syntax reminder.
with engine.connect() as conn:
     conn.execute('CREATE TABLE table_name ( \
    column_name1 col_type (field_length) column_constraints,\
    column_name2 col_type (field_length),\
    column_name3 col_type (field_length));')

In [ ]:
print disk_engine.raw_connection().connection.text_factory
disk_engine.raw_connection().connection.text_factory = lambda x: unicode(x, 'utf-8', 'ignore')
print disk_engine.raw_connection().connection.text_factory

In [ ]:
df_trim = pd.read_sql_query('SELECT * FROM {} LIMIT 10000'.format(table), disk_engine)

In [ ]:
df_trim.head()

In [ ]:
table = 'event'
# df_trim.to_sql(table, disk_engine, if_exists='replace')

In [ ]:
####################CSV transfer!!!!!!!!!!!
# Bulk-load the events CSV into the `table` relation chunk by chunk using
# Postgres COPY (psycopg2 cursor.copy_from) via an in-memory TSV buffer.
start = dt.datetime.now()
chunksize = 300000
j = 0
index_start = 1
###################data source
file_loc = data_dir+evt_name
########################
# The source encodes the timestamp as separate date ('01JAN2014') and
# time ('12:34:56.789') columns; getTime combines them into one datetime.
dtFormat = "%d%b%Y %H:%M:%S.%f"
def getTime(x):
    # x is one row (Series); relies on AUTHZN_RQST_PROC_DT / _TM being present.
    dtString = "{} {}".format(x.AUTHZN_RQST_PROC_DT,x.AUTHZN_RQST_PROC_TM)
    return dt.datetime.strptime(dtString,dtFormat)

for df in pd.read_csv(file_loc, chunksize=chunksize, iterator=True,encoding='ISO-8859-1'):
    
    df = df.rename(columns={c: c.replace(' ', '') for c in df.columns}) # Remove spaces from columns

#     df['AUTHZN_RQST_PROC_DT'] = pd.to_datetime(df['AUTHZN_RQST_PROC_DT'],format='%d%b%Y') # Convert to datetimes
#     df['AUTHZN_RQST_PROC_TM'] = df['AUTHZN_RQST_PROC_DT']+ pd.to_datetime(df.AUTHZN_RQST_PROC_TM).dt.time
    # Coerce columns toward the types declared in the CREATE TABLE DDL above.
    df['acct_id'] = df['acct_id'].astype(str)
    df['AUTHZN_RQST_PROC_TM'] = df.apply(lambda x: getTime(x),1)
    df['AUTHZN_APPRL_CD'] =pd.to_numeric(df['AUTHZN_APPRL_CD'], errors='coerce')
    df['AUTHZN_APPRL_CD'] =df['AUTHZN_APPRL_CD'].astype(str)
    df.MRCH_CNTRY_CD = df.MRCH_CNTRY_CD.astype(str)
    df.MRCH_CATG_CD = df.MRCH_CATG_CD.astype(str)
    df.AUTHZN_MSG_TYPE_MODR_CD = df.AUTHZN_MSG_TYPE_MODR_CD.astype(str)
    df.RQST_CARD_SEQ_NUM = df.RQST_CARD_SEQ_NUM.astype(str)
    df.ECMRC_SCURT_CD = df.ECMRC_SCURT_CD.astype(str)
    df.ACQR_BIN_NUM = df.ACQR_BIN_NUM.astype(str)
    # Date-only columns; unparseable values become NaT instead of aborting the load.
    df.PREV_ADR_CHNG_DT =pd.to_datetime(df.PREV_ADR_CHNG_DT,errors='coerce',format='%d%b%Y')
    df.PREV_PMT_DT = pd.to_datetime(df.PREV_PMT_DT,errors='coerce',format='%d%b%Y')
    df.PREV_CARD_RQST_DT = pd.to_datetime(df.PREV_CARD_RQST_DT,errors='coerce',format='%d%b%Y')
    df.FRD_IND_SWT_DT = pd.to_datetime(df.FRD_IND_SWT_DT,errors='coerce',format='%d%b%Y')
#     df['AUTHZN_RQST_PROC_TM'] = pd.to_datetime(df[['AUTHZN_RQST_PROC_DT','AUTHZN_RQST_PROC_TM']],format='%Y%m%d%H')
#     df['AUTHZN_RQST_PROC_TM'] = pd.to_datetime(df.AUTHZN_RQST_PROC_DT.dt.strftime('%Y-%m-%d') +' '+ df.AUTHZN_RQST_PROC_TM.dt.strftime('%H'))
#     df['PREV_ADR_CHNG_DT'] = pd.to_datetime(df['PREV_ADR_CHNG_DT'])
#     df['PREV_PMT_DT'] = pd.to_datetime(df['PREV_PMT_DT'])
#     df['PREV_CARD_RQST_DT'] = pd.to_datetime(df['PREV_CARD_RQST_DT'])
#     df['FRD_IND_SWT_DT'] = pd.to_datetime(df['FRD_IND_SWT_DT'])
    # Keep a globally increasing row index across chunks.
    df.index += index_start

    # Remove the un-interesting columns
    columns = ['AUTHZN_RQST_PROC_DT','EXCSV_ACTVY_PARM_CD']

    for c in df.columns:
        if c in columns:
            df = df.drop(c, axis=1)    

    
    j+=1
    t_mid = dt.datetime.now()
    print '{} seconds: completed {} rows'.format((dt.datetime.now() - start).seconds, j*chunksize)
    
#     display(df)
#     print df.dtypes
    
#     break
#     table = 'data_trim'
    # Stream the cleaned chunk through an in-memory buffer into COPY.
    output = cStringIO.StringIO()
    #ignore the index
    df.to_csv(output, sep='\t', header=False, index=False)
    #jump to start of stream
    output.seek(0)
    # NOTE(review): `contents` is never used; getvalue() only duplicates the
    # buffer in memory and can be dropped.
    contents = output.getvalue()
    cur = connection.cursor()
    #null values become ''
    cur.copy_from(output,table, null="")    
    connection.commit()
    cur.close()
    print '{} seconds: inserted {} rows'.format((dt.datetime.now() - t_mid).seconds, j*chunksize)
    index_start = df.index[-1] + 1

In [ ]:
# Alternative event loader: identical per-chunk cleaning to the COPY-based
# cell above, but inserts via DataFrame.to_sql (slower; kept as a fallback).
# NOTE(review): near-duplicate of the previous cell -- consider factoring the
# shared cleaning steps into a single function.
start = dt.datetime.now()
chunksize = 300000
j = 0
index_start = 1
###################data source
file_loc = data_dir+evt_name
########################
# Combine the separate date ('01JAN2014') and time columns into one datetime.
dtFormat = "%d%b%Y %H:%M:%S.%f"
def getTime(x):
    dtString = "{} {}".format(x.AUTHZN_RQST_PROC_DT,x.AUTHZN_RQST_PROC_TM)
    return dt.datetime.strptime(dtString,dtFormat)

for df in pd.read_csv(file_loc, chunksize=chunksize, iterator=True,encoding='ISO-8859-1'):
    
    df = df.rename(columns={c: c.replace(' ', '') for c in df.columns}) # Remove spaces from columns

#     df['AUTHZN_RQST_PROC_DT'] = pd.to_datetime(df['AUTHZN_RQST_PROC_DT'],format='%d%b%Y') # Convert to datetimes
#     df['AUTHZN_RQST_PROC_TM'] = df['AUTHZN_RQST_PROC_DT']+ pd.to_datetime(df.AUTHZN_RQST_PROC_TM).dt.time
    # Coerce columns toward the declared table types.
    df['acct_id'] = df['acct_id'].astype(str)
    df['AUTHZN_RQST_PROC_TM'] = df.apply(lambda x: getTime(x),1)
    df['AUTHZN_APPRL_CD'] =pd.to_numeric(df['AUTHZN_APPRL_CD'], errors='coerce')
    df['AUTHZN_APPRL_CD'] =df['AUTHZN_APPRL_CD'].astype(str)
    df.MRCH_CNTRY_CD = df.MRCH_CNTRY_CD.astype(str)
    df.MRCH_CATG_CD = df.MRCH_CATG_CD.astype(str)
    df.AUTHZN_MSG_TYPE_MODR_CD = df.AUTHZN_MSG_TYPE_MODR_CD.astype(str)
    df.RQST_CARD_SEQ_NUM = df.RQST_CARD_SEQ_NUM.astype(str)
    df.ECMRC_SCURT_CD = df.ECMRC_SCURT_CD.astype(str)
    df.ACQR_BIN_NUM = df.ACQR_BIN_NUM.astype(str)
    # Date-only columns; unparseable values become NaT instead of aborting.
    df.PREV_ADR_CHNG_DT =pd.to_datetime(df.PREV_ADR_CHNG_DT,errors='coerce',format='%d%b%Y')
    df.PREV_PMT_DT = pd.to_datetime(df.PREV_PMT_DT,errors='coerce',format='%d%b%Y')
    df.PREV_CARD_RQST_DT = pd.to_datetime(df.PREV_CARD_RQST_DT,errors='coerce',format='%d%b%Y')
    df.FRD_IND_SWT_DT = pd.to_datetime(df.FRD_IND_SWT_DT,errors='coerce',format='%d%b%Y')
#     df['AUTHZN_RQST_PROC_TM'] = pd.to_datetime(df[['AUTHZN_RQST_PROC_DT','AUTHZN_RQST_PROC_TM']],format='%Y%m%d%H')
#     df['AUTHZN_RQST_PROC_TM'] = pd.to_datetime(df.AUTHZN_RQST_PROC_DT.dt.strftime('%Y-%m-%d') +' '+ df.AUTHZN_RQST_PROC_TM.dt.strftime('%H'))
#     df['PREV_ADR_CHNG_DT'] = pd.to_datetime(df['PREV_ADR_CHNG_DT'])
#     df['PREV_PMT_DT'] = pd.to_datetime(df['PREV_PMT_DT'])
#     df['PREV_CARD_RQST_DT'] = pd.to_datetime(df['PREV_CARD_RQST_DT'])
#     df['FRD_IND_SWT_DT'] = pd.to_datetime(df['FRD_IND_SWT_DT'])
    # Keep a globally increasing row index across chunks.
    df.index += index_start

    # Remove the un-interesting columns
    columns = ['AUTHZN_RQST_PROC_DT','EXCSV_ACTVY_PARM_CD']

    for c in df.columns:
        if c in columns:
            df = df.drop(c, axis=1)    

    
    j+=1
    t_mid = dt.datetime.now()
    print '{} seconds: completed {} rows'.format((dt.datetime.now() - start).seconds, j*chunksize)
    
#     display(df)
#     print df.dtypes
    
#     break
#     table = 'data_trim'
    df.to_sql(table, disk_engine, if_exists='append')
    print '{} seconds: inserted {} rows'.format((dt.datetime.now() - t_mid).seconds, j*chunksize)
    index_start = df.index[-1] + 1

In [ ]:
import sqlite3
def init_sqlite3(dbname):
    """Open (creating if needed) the SQLite database at `dbname` and return the connection."""
    return sqlite3.connect(dbname)
disk_engine =  init_sqlite3(data_dir+db_name)

In [ ]:
drop_qry = '''DROP TABLE IF EXISTS {table}; '''


table = 'auth'
cursor.execute(drop_qry.format(table=table))
connection.commit()

In [ ]:
# Load the (~107M-row) auths CSV into the 'auth' table chunk by chunk via
# DataFrame.to_sql, applying the same cleaning as the event loaders.
# NOTE(review): unlike the event loaders above, acct_id is NOT cast to str
# here -- confirm whether that is intentional.
table = 'auth'
start = dt.datetime.now()
chunksize = 300000
j = 0
index_start = 1
file_loc = data_dir+auth_name
# Combine the separate date ('01JAN2014') and time columns into one datetime.
dtFormat = "%d%b%Y %H:%M:%S.%f"
def getTime(x):
    dtString = "{} {}".format(x.AUTHZN_RQST_PROC_DT,x.AUTHZN_RQST_PROC_TM)
    return dt.datetime.strptime(dtString,dtFormat)

for df in pd.read_csv(file_loc, chunksize=chunksize, iterator=True,encoding='ISO-8859-1'):
    
    df = df.rename(columns={c: c.replace(' ', '') for c in df.columns}) # Remove spaces from columns

#     df['AUTHZN_RQST_PROC_DT'] = pd.to_datetime(df['AUTHZN_RQST_PROC_DT'],format='%d%b%Y') # Convert to datetimes
#     df['AUTHZN_RQST_PROC_TM'] = df['AUTHZN_RQST_PROC_DT']+ pd.to_datetime(df.AUTHZN_RQST_PROC_TM).dt.time
    df['AUTHZN_RQST_PROC_TM'] = df.apply(lambda x: getTime(x),1)
    df['AUTHZN_APPRL_CD'] =pd.to_numeric(df['AUTHZN_APPRL_CD'], errors='coerce')
    df['AUTHZN_APPRL_CD'] =df['AUTHZN_APPRL_CD'].astype(str)
    df.MRCH_CNTRY_CD = df.MRCH_CNTRY_CD.astype(str)
    df.MRCH_CATG_CD = df.MRCH_CATG_CD.astype(str)
    df.AUTHZN_MSG_TYPE_MODR_CD = df.AUTHZN_MSG_TYPE_MODR_CD.astype(str)
    df.RQST_CARD_SEQ_NUM = df.RQST_CARD_SEQ_NUM.astype(str)
    df.ECMRC_SCURT_CD = df.ECMRC_SCURT_CD.astype(str)
    df.ACQR_BIN_NUM = df.ACQR_BIN_NUM.astype(str)
    # Date-only columns; unparseable values become NaT instead of aborting.
    df.PREV_ADR_CHNG_DT =pd.to_datetime(df.PREV_ADR_CHNG_DT,errors='coerce',format='%d%b%Y')
    df.PREV_PMT_DT = pd.to_datetime(df.PREV_PMT_DT,errors='coerce',format='%d%b%Y')
    df.PREV_CARD_RQST_DT = pd.to_datetime(df.PREV_CARD_RQST_DT,errors='coerce',format='%d%b%Y')
    df.FRD_IND_SWT_DT = pd.to_datetime(df.FRD_IND_SWT_DT,errors='coerce',format='%d%b%Y')
#     df['AUTHZN_RQST_PROC_TM'] = pd.to_datetime(df[['AUTHZN_RQST_PROC_DT','AUTHZN_RQST_PROC_TM']],format='%Y%m%d%H')
#     df['AUTHZN_RQST_PROC_TM'] = pd.to_datetime(df.AUTHZN_RQST_PROC_DT.dt.strftime('%Y-%m-%d') +' '+ df.AUTHZN_RQST_PROC_TM.dt.strftime('%H'))
#     df['PREV_ADR_CHNG_DT'] = pd.to_datetime(df['PREV_ADR_CHNG_DT'])
#     df['PREV_PMT_DT'] = pd.to_datetime(df['PREV_PMT_DT'])
#     df['PREV_CARD_RQST_DT'] = pd.to_datetime(df['PREV_CARD_RQST_DT'])
#     df['FRD_IND_SWT_DT'] = pd.to_datetime(df['FRD_IND_SWT_DT'])
    # Keep a globally increasing row index across chunks.
    df.index += index_start

    # Remove the un-interesting columns
    columns = ['AUTHZN_RQST_PROC_DT','EXCSV_ACTVY_PARM_CD']

    for c in df.columns:
        if c in columns:
            df = df.drop(c, axis=1)    

    
    j+=1
    print '{} seconds: completed {} rows'.format((dt.datetime.now() - start).seconds, j*chunksize)
#     display(df)
#     print df.dtypes
    
#     break
    
    df.to_sql(table, disk_engine, if_exists='append')
    index_start = df.index[-1] + 1

In [ ]:


In [ ]:

Preview Table


In [ ]:
table = 'auth'

In [ ]:
df_total = pd.read_sql_query('SELECT count(*) FROM {}'.format(table), disk_engine)
df_total.head()

In [ ]:
df_total_F = pd.read_sql_query('SELECT count(*) FROM {}  where FRD_IND = "Y"'.format(table), disk_engine)
df_total_F.head()

In [ ]:
df_total_F_G = pd.read_sql_query('SELECT FRD_IND, count(*) as tr_num FROM {}  group by FRD_IND'.format(table), disk_engine)
df_total_F_G.head()

In [ ]:
# Pie chart of fraud vs. genuine transaction counts.
# Presumably row 0 of df_total_F_G is the 'N' (genuine) group and row 1 the
# 'Y' (fraud) group -- that ordering drives the label/value pairing below.
title = 'Fraud Distribution'
pie_trace = {'labels': ['Fraud', 'Genuine'],
             'values': [df_total_F_G['tr_num'][1], df_total_F_G['tr_num'][0]],
             'type': 'pie'}
fig = {'data': [pie_trace], 'layout': {'title': title}}
iplot(fig, filename='figures/' + title)

In [ ]:
df = pd.read_sql_query('SELECT * FROM {} LIMIT 3'.format(table), disk_engine)
df.head()

Number of unique users


In [ ]:
col = 'acct_id'
df_dst_u = pd.read_sql_query('SELECT COUNT(DISTINCT {}) FROM {}'.format(col,table), disk_engine)
df_dst_u

In [ ]:
df_dst_u

Fraud Transaction Distribution

select FRD_IND, count(*) from data group by FRD_IND;


In [ ]:
# Transaction counts per fraud flag, most frequent first.
# Fixes two defects in the original query string:
#   - missing trailing spaces made the adjacent literals fuse into
#     'GROUP BY FRD_INDORDER BY ...' (invalid SQL)
#   - the count was aliased num_fraud but ORDER BY referenced num_trans
col = 'FRD_IND'
df = pd.read_sql_query('SELECT {0}, COUNT(*) as num_fraud '
                       'FROM {1} '
                       'GROUP BY {0} '
                       'ORDER BY num_fraud DESC'.format(col,table), disk_engine)

df.head()

Infringed Users


In [ ]:
df_infr = pd.read_sql_query('select distinct FRD_IND,count(distinct acct_id) as num_usr '
                       'from {table} '
                       'group by FRD_IND'.format(table=table), disk_engine)
df_infr

In [ ]:
title = 'Fraud by Distinct Users'
fig = {
    'data': [{'labels': ['Fraud', 'Genuine'],
              'values': [df_infr['num_usr'][1], df_infr['num_usr'][0]],
              'type': 'pie'}],
    'layout': {'title': title}
     }
iplot(fig,filename='figures/'+title)

In [ ]:
# Ratio of genuine users (row 0) to infringed users (row 1), used later to
# drive the stratified-subsample interleaving loop.
# NOTE(review): under Python 2 this is integer (floor) division since both
# counts are ints -- presumably intentional (an integer interleave ratio),
# but confirm; cast to float if a fractional ratio is wanted.
usr_ratio = df_infr['num_usr'][0]/ df_infr['num_usr'][1]
usr_ratio

Transactions per user

select acct_id, count(*) as num_trans from data group by acct_id order by num_trans DESC;


In [ ]:
col = '*'
df = pd.read_sql_query('SELECT {0} '
                       'FROM {1} '
                       'where acct_id = "."'.format(col,table), disk_engine)

df.head(8)

In [ ]:
col = 'acct_id'
df_u_t = pd.read_sql_query('SELECT {0}, COUNT(*) as `num_trans` '
                       'FROM {1} '
                       'GROUP BY {0} ' 
                       'ORDER BY `num_trans` DESC'.format(col,table), disk_engine)

df_u_t.head(8)

In [ ]:
title = 'Transactions Histogram'
data = [
    Histogram(
        x=df_u_t['num_trans']
    )
]

iplot(data,filename='figures/'+title)

In [ ]:


In [ ]:
df_u_t['num_trans'].median()

In [ ]:
df_u_t['num_trans'].mode()

In [ ]:
df_u_t['num_trans'].describe()

In [ ]:
df_u_t.boxplot(['num_trans'])

In [ ]:
# Box plot of transactions-per-user.
# Fix: use the offline iplot imported at the top -- py.iplot is the
# plotly.plotly ONLINE API and requires the py.sign_in call that is
# commented out in the setup cell, so it would fail here.
trace0 = Box(
    y=df_u_t['num_trans']
)

data = [trace0]
iplot(data)

In [ ]:
df_u_t.groupby('num_trans').count()

Fraud Transactions per User

select acct_id, count(*) as num_fraud from data where FRD_IND = "Y" group by acct_id order by num_fraud DESC;


In [ ]:
col = 'acct_id'
df_u_ft = pd.read_sql_query('select {0}, count(*) as num_fraud '
                       'from {1} where FRD_IND = "Y" '
                       'group by {0} order by num_fraud DESC'.format(col,table), disk_engine)

df_u_ft.head()

Distribution of Label Confirmation


In [ ]:
col = 'FRD_IND_SWT_DT'
count_nm = 'num_dates'
df_l_t = pd.read_sql_query('select {0}, count(*) as {2} '
                       'from {1} where FRD_IND = "Y" '
                       'group by {0} order by {2} DESC'.format(col,table,count_nm), disk_engine)

df_l_t.head()

In [ ]:
df_l_t['num_dates'].describe()

In [ ]:
col = 'FRD_IND_SWT_DT'
count_nm = 'num_dates'
df_l_t_all = pd.read_sql_query('select {0} '
                       'from {1} where FRD_IND = "Y" '
                       'order by {0} ASC'.format(col,table,count_nm), disk_engine)

df_l_t_all.head()

In [ ]:
title = 'Dates Histogram'
data = [
    Histogram(
        x=df_l_t_all['FRD_IND_SWT_DT']
    )
]

iplot(data,filename='figures/'+title)

In [ ]:
df_l_t_all['FRD_IND_SWT_DT'].describe()

In [ ]:
df_l_t_all[pd.to_datetime(df_l_t_all['FRD_IND_SWT_DT'])>pd.to_datetime('2014-05-01')].shape

In [ ]:
frac = df_l_t_all[pd.to_datetime(df_l_t_all['FRD_IND_SWT_DT'])>pd.to_datetime('2014-05-01')].count()[0]

In [ ]:
total_dates = df_l_t_all.count()[0]

In [ ]:
frac*1.0/total_dates

In [ ]:
# Bar chart: number of fraud-label confirmations per switch date.
title = 'Confirmations per Date'  # typo fixed: was 'Confiramtions'
trace0 = Bar(
    x= df_l_t['FRD_IND_SWT_DT'],
    y=df_l_t['num_dates'],
)

data = [trace0]
layout = go.Layout(
    title=title,
)

fig = go.Figure(data=data, layout=layout)
# Fix: plot the assembled Figure -- the original passed `data` to iplot,
# silently discarding the layout (and its title).
iplot(fig,filename='figures/'+title)

Transaction Data Distribution


In [ ]:
col = 'AUTHZN_RQST_PROC_TM'
count_nm = 'num_dates'
df_t_time = pd.read_sql_query('select {0} '
                       'from {1} where FRD_IND = "Y" '
                       'order by {0} ASC'.format(col,table,count_nm), disk_engine)

df_t_time.head()

In [ ]:
table = 'auth'
subset_df =  pd.read_sql_query('select * '
                       'from {table} limit 10000'.format(table=table), disk_engine)

In [ ]:
pclass_xt = pd.crosstab(subset_df['acct_id'], subset_df['FRD_IND'])
display(pclass_xt)

In [ ]:
# Keep the crosstab plottable by dropping rows 10..199.
# Fix: equivalent to the original but avoids deprecated nested-list Index
# indexing (pclass_xt.index[[np.arange(10,200)]]).
pclass_xt.drop(pclass_xt.index[10:200], inplace=True)

In [ ]:
pclass_xt

In [ ]:
# Normalize each account's row to proportions and plot as a stacked bar.
pclass_xt_pct = pclass_xt.div(pclass_xt.sum(1).astype(float), axis=0)
pclass_xt_pct.plot(kind='bar', 
                   stacked=True, 
                   title='Distribution of transactions per user')
# Fix: the axes were mislabelled ('Genuine'/'Fraud') -- x is the account id,
# y is each account's proportion of genuine vs fraud transactions. Also
# removed a bare `pclass_xt_pct` expression that displayed nothing mid-cell.
plt.xlabel('acct_id')
plt.ylabel('Proportion of transactions')

Fraud in-between genuine


In [ ]:
# Raw SQL scratch note -- not valid Python, so this cell raised a SyntaxError:
# select acct_id, AUTHZN_RQST_PROC_TM,FRD_IND from data where acct_id = 337018623;

Number of infringed users


In [ ]:
# Raw SQL scratch note -- not valid Python, so this cell raised a SyntaxError:
# select FRD_IND, count(distinct acct_id) from data group by FRD_IND;

Encode Column


In [ ]:
def encode_column(df_col):
    """Integer-encode a categorical column.

    Maps each distinct value of `df_col` to an int in first-appearance
    order and returns the mapping.

    Parameters
    ----------
    df_col : pd.Series
        Column to encode (typically dtype 'object').

    Returns
    -------
    dict
        {original_value: integer_code}

    NOTE(review): the original cell was left unfinished (empty def body ->
    SyntaxError); this is a minimal implementation of the apparent intent,
    inferred from the `encoders[r] = encode_column(df[r])` call site below --
    confirm the desired return type.
    """
    _, uniques = pd.factorize(df_col)
    return dict((val, i) for i, val in enumerate(uniques))

In [ ]:
# Build an integer encoder for every object-dtype column of df.
encoders = {}
for pos, col_name in enumerate(df):
    if df.dtypes[pos] == 'object':
        encoders[col_name] = encode_column(df[col_name])
encoders

Stratified subsample


In [ ]:
df_ds_u = pd.read_sql_query('select distinct acct_id, FRD_IND '
                       'from {table} '
                       'order by FRD_IND'.format(table=table), disk_engine)
df_ds_u

In [ ]:
# Stratified subsample of users: df_ds_u is ordered by FRD_IND, so genuine
# ('N') users sit at the head and fraud ('Y') users at the tail. Take
# `usr_ratio` users from the head for every one user from the tail, so the
# sample preserves the population's genuine:fraud user ratio.
users = set()
cnt = 0
head = 0
tail = len(df_ds_u.acct_id)-1
sample_size = tail
for i in range(sample_size):
    
    if cnt<usr_ratio:
        users.add(df_ds_u.acct_id[head])
        cnt+=1
        head+=1
    else:
        users.add(df_ds_u.acct_id[tail])
        tail-=1
        cnt=0

In [ ]:
def generate_sequence(user,table):
    """Build the transaction sequence for one user from `table`.

    NOTE(review): the original cell was unfinished (a dangling `for` caused a
    SyntaxError). Minimal completion: return the user's rows from
    get_user_info unchanged. TODO: apply the intended per-row transform once
    known.
    """
    df = get_user_info(user,table)
    return df

In [ ]:
# Collect one per-user transaction sequence for every sampled user.
# Fix: the list was created as `sequnences` (typo) but appended to
# `sequences`, which raised a NameError on the first iteration.
sequences = []
for user in users:
    sequences.append(generate_sequence(user,table))

In [ ]:
for r in df.dtypes:
    print r

In [ ]:
import keras

In [ ]:
(1e-3)*10

In [ ]:
gs_results_dir = 'gs_results_little.csv'
df_GS = pd.read_csv(data_dir+gs_results_dir,header=None)

In [ ]:
df_GS.sort_values(7,ascending=False)

In [ ]:
# Fix: help() takes a single object; the stray `ascending=True` keyword made
# this cell raise a TypeError.
help(df_GS.sort_values)

In [ ]:
gs_results_dir = 'gs_results_trim.csv'
df_GS_trim = pd.read_csv(data_dir+gs_results_dir,header=None)
df_GS_trim.sort_values(8)

In [ ]:
table = 'data_more'
dataFrame = pd.read_sql_query('select acct_id, count(*) as num_trans '
                       'from {table} '
                        ''
                       'group by acct_id '
                        'order by num_trans'
                       .format(table=table), disk_engine)
dataFrame

In [ ]:
encoders

In [ ]: